library(tidyverse)
library(janitor)
library(ggplot2)
library(lubridate)
library(viridis)
library(forcats)

Introduction

The goal of this report is to show which times of the day are more dangerous for which types of crime. We have data for Seattle and San Francisco and want to analyze how the occurrence of each type of crime varies with the time of day.

Data source

The data used are the crime statistics of the city of Seattle, downloadable here: https://github.com/uwescience/datasci_course_materials/tree/master/assignment6

Analysis for Seattle

The data for Seattle look like this:

# Load the Seattle incident file and normalise the column names
# (e.g. `Offense Type` becomes `offense_type`).
seattle <- clean_names(
    readr::read_csv("./seattle_incidents_summer_2014.csv")
)
## Parsed with column specification:
## cols(
##   `RMS CDW ID` = col_integer(),
##   `General Offense Number` = col_double(),
##   `Offense Code` = col_character(),
##   `Offense Code Extension` = col_integer(),
##   `Offense Type` = col_character(),
##   `Summary Offense Code` = col_character(),
##   `Summarized Offense Description` = col_character(),
##   `Date Reported` = col_character(),
##   `Occurred Date or Date Range Start` = col_character(),
##   `Occurred Date Range End` = col_character(),
##   `Hundred Block Location` = col_character(),
##   `District/Sector` = col_character(),
##   `Zone/Beat` = col_character(),
##   `Census Tract 2000` = col_double(),
##   Longitude = col_double(),
##   Latitude = col_double(),
##   Location = col_character(),
##   Month = col_integer(),
##   Year = col_integer()
## )
# Show one line per column. The argument name must be spelled `width`;
# a misspelled name is swallowed by `...` and the default width is used.
glimpse(seattle, width = 120)
## Observations: 32,779
## Variables: 19
## $ rms_cdw_id                        <int> 483839, 481252, 481375, 4816...
## $ general_offense_number            <dbl> 2015218538, 2015213067, 2015...
## $ offense_code                      <chr> "2202", "2610", "2316", "259...
## $ offense_code_extension            <int> 0, 0, 0, 0, 3, 0, 0, 0, 0, 1...
## $ offense_type                      <chr> "BURGLARY-FORCE-RES", "FRAUD...
## $ summary_offense_code              <chr> "2200", "2600", "2300", "250...
## $ summarized_offense_description    <chr> "BURGLARY", "FRAUD", "MAIL T...
## $ date_reported                     <chr> "06/28/2015 10:31:00 AM", "0...
## $ occurred_date_or_date_range_start <chr> "06/28/2014 10:31:00 AM", "0...
## $ occurred_date_range_end           <chr> "06/28/2015 10:31:00 AM", "0...
## $ hundred_block_location            <chr> "6XX BLOCK OF NW 74 ST", "23...
## $ district_sector                   <chr> "J", "C", "F", "M", "J", "R"...
## $ zone_beat                         <chr> "J2", "C2", "F3", "M2", "J3"...
## $ census_tract_2000                 <dbl> 2900.3013, 6300.1004, 11300....
## $ longitude                         <dbl> -122.3647, -122.2771, -122.3...
## $ latitude                          <dbl> 47.68252, 47.63990, 47.52923...
## $ location                          <chr> "(47.68252427, -122.36467199...
## $ month                             <int> 6, 6, 8, 6, 6, 6, 6, 8, 8, 7...
## $ year                              <int> 2014, 2014, 2014, 2014, 2014...
# Parse the occurrence timestamp (month/day/year hour:minute:second text)
# and then derive the hour of day at which each incident started.
seattle <- seattle %>%
    mutate(
        occurred_date_or_date_range_start = mdy_hms(occurred_date_or_date_range_start)
    ) %>%
    mutate(start_hour = hour(occurred_date_or_date_range_start))

First, I plot all crime incidents by their starting hour and the district in which they occur:

# One jittered point per incident: hour of day on x, offense type on y,
# coloured by police district. Transparency reduces overplotting.
ggplot(
    seattle,
    aes(
        x = start_hour,
        y = summarized_offense_description,
        color = district_sector
    )
) +
    geom_jitter(alpha = 0.5) +
    labs(x = "Hour of day", y = "Type of crime", color = "District")

This clearly shows that the crime of ‘Prostitution’ occurs mainly in one district (‘N’) and is concentrated at certain times of the day, namely between midday and 2 o’clock in the morning.

Further, it seems that districts R and S have higher overall crime rates.

Different types of crime occur with different frequencies:

# Frequency table of offense types, most frequent first.
# The rows are sorted explicitly with arrange() instead of passing `sort = T`:
# `T` is a reassignable alias for TRUE, and newer janitor releases do not
# appear to support a `sort` argument for tabyl() (NOTE(review): confirm
# against the installed janitor version).
seattle %>%
    tabyl(summarized_offense_description) %>%
    arrange(desc(n))
##    summarized_offense_description    n      percent
## 1                       CAR PROWL 6230 1.900607e-01
## 2                  OTHER PROPERTY 3755 1.145551e-01
## 3                        BURGLARY 3212 9.798957e-02
## 4                   VEHICLE THEFT 3057 9.326093e-02
## 5                 PROPERTY DAMAGE 2365 7.214985e-02
## 6                         ASSAULT 2018 6.156381e-02
## 7                           FRAUD 1473 4.493731e-02
## 8                     DISTURBANCE 1333 4.066628e-02
## 9                         THREATS 1178 3.593764e-02
## 10                STOLEN PROPERTY 1136 3.465633e-02
## 11                 WARRANT ARREST 1021 3.114799e-02
## 12                    SHOPLIFTING  944 2.879893e-02
## 13                     BIKE THEFT  797 2.431435e-02
## 14                        ROBBERY  736 2.245340e-02
## 15                       TRESPASS  486 1.482657e-02
## 16                      NARCOTICS  391 1.192837e-02
## 17    BURGLARY-SECURE PARKING-RES  388 1.183685e-02
## 18                     MAIL THEFT  264 8.053937e-03
## 19                        TRAFFIC  252 7.687849e-03
## 20                   PROSTITUTION  202 6.162482e-03
## 21                        DISPUTE  171 5.216755e-03
## 22                  LOST PROPERTY  162 4.942189e-03
## 23                    COUNTERFEIT  159 4.850667e-03
## 24                     PICKPOCKET  146 4.454071e-03
## 25                         WEAPON  137 4.179505e-03
## 26               ANIMAL COMPLAINT   96 2.928704e-03
## 27              THEFT OF SERVICES   96 2.928704e-03
## 28       VIOLATION OF COURT ORDER   90 2.745660e-03
## 29                         INJURY   71 2.166021e-03
## 30                        FORGERY   59 1.799933e-03
## 31                       EMBEZZLE   57 1.738918e-03
## 32               LIQUOR VIOLATION   48 1.464352e-03
## 33                       OBSTRUCT   38 1.159279e-03
## 34                            DUI   34 1.037249e-03
## 35             RECOVERED PROPERTY   34 1.037249e-03
## 36                ILLEGAL DUMPING   26 7.931908e-04
## 37                   FALSE REPORT   23 7.016688e-04
## 38               RECKLESS BURNING   23 7.016688e-04
## 39                  BIAS INCIDENT   20 6.101467e-04
## 40                       FIREWORK    9 2.745660e-04
## 41                   PURSE SNATCH    9 2.745660e-04
## 42                        ELUDING    8 2.440587e-04
## 43                       HOMICIDE    8 2.440587e-04
## 44       [INC - CASE DC USE ONLY]    5 1.525367e-04
## 45                PUBLIC NUISANCE    4 1.220293e-04
## 46                         ESCAPE    3 9.152201e-05
## 47                    PORNOGRAPHY    3 9.152201e-05
## 48             DISORDERLY CONDUCT    2 6.101467e-05

The following plot shows that car prowls and other types of theft and robbery occur most often.

# Horizontal bar chart of incident counts per offense type,
# with the bars ordered by their count.
seattle %>%
    count(summarized_offense_description) %>%
    mutate(
        summarized_offense_description =
            fct_reorder(summarized_offense_description, n)
    ) %>%
    ggplot(aes(x = summarized_offense_description, y = n)) +
    geom_col() +
    coord_flip() +
    labs(
        title = "Frequencies of crime types",
        x = "type of crime",
        y = "# crimes"
    )

To get a good overview of the types of crime occurring over the course of a day, we will look at a heat map that shows how many crimes (and of which type) occurred at each hour of the day.

# Heat map of incident counts per offense type (y) and hour of day (x).
# The previously computed `total` column was never used and the ungroup()
# was a no-op on the ungrouped result of count(), so both are dropped.
seattle %>%
    count(summarized_offense_description, start_hour) %>%
    # Order the offense types by fct_reorder()'s default summary of their
    # hourly counts (unchanged from the original ordering).
    mutate(
        summarized_offense_description =
            fct_reorder(summarized_offense_description, n)
    ) %>%
    ggplot() +
    geom_tile(aes(start_hour, summarized_offense_description, fill = n)) +
    scale_fill_viridis(name = "# crimes") +
    labs(
        x = "Hour of the day",
        y = "type of crime",
        title = "Crime types during the day for Seattle"
    )

This shows that certain types of crime are concentrated at certain times of the day.

Analysis of San Francisco

The data for San Francisco are similar, but the crime categories are different.

# Load the San Francisco incident file and normalise the column names
# (e.g. `PdDistrict` becomes `pddistrict`).
sf <- clean_names(
    readr::read_csv("./sanfrancisco_incidents_summer_2014.csv")
)
## Parsed with column specification:
## cols(
##   IncidntNum = col_integer(),
##   Category = col_character(),
##   Descript = col_character(),
##   DayOfWeek = col_character(),
##   Date = col_character(),
##   Time = col_time(format = ""),
##   PdDistrict = col_character(),
##   Resolution = col_character(),
##   Address = col_character(),
##   X = col_double(),
##   Y = col_double(),
##   Location = col_character(),
##   PdId = col_double()
## )
# Show one line per column of the San Francisco data, 120 characters wide.
sf %>% glimpse(width = 120)
## Observations: 28,993
## Variables: 13
## $ incidntnum <int> 140734311, 140736317, 146177923, 146177531, 140734220, 140734349, 140734349, 140734349, 14073814...
## $ category   <chr> "ARSON", "NON-CRIMINAL", "LARCENY/THEFT", "LARCENY/THEFT", "NON-CRIMINAL", "DRUG/NARCOTIC", "DRU...
## $ descript   <chr> "ARSON OF A VEHICLE", "LOST PROPERTY", "GRAND THEFT FROM LOCKED AUTO", "GRAND THEFT FROM LOCKED ...
## $ dayofweek  <chr> "Sunday", "Sunday", "Sunday", "Sunday", "Sunday", "Sunday", "Sunday", "Sunday", "Sunday", "Sunda...
## $ date       <chr> "08/31/2014", "08/31/2014", "08/31/2014", "08/31/2014", "08/31/2014", "08/31/2014", "08/31/2014"...
## $ time       <time> 23:50:00, 23:45:00, 23:30:00, 23:30:00, 23:23:00, 23:13:00, 23:13:00, 23:13:00, 23:00:00, 23:00...
## $ pddistrict <chr> "BAYVIEW", "MISSION", "SOUTHERN", "RICHMOND", "RICHMOND", "SOUTHERN", "SOUTHERN", "SOUTHERN", "I...
## $ resolution <chr> "NONE", "NONE", "NONE", "NONE", "NONE", "ARREST, BOOKED", "ARREST, BOOKED", "ARREST, BOOKED", "N...
## $ address    <chr> "LOOMIS ST / INDUSTRIAL ST", "400 Block of CASTRO ST", "1000 Block of MISSION ST", "FULTON ST / ...
## $ x          <dbl> -122.4056, -122.4350, -122.4098, -122.4853, -122.5099, -122.4166, -122.4166, -122.4166, -122.424...
## $ y          <dbl> 37.73832, 37.76177, 37.78004, 37.77252, 37.77231, 37.77391, 37.77391, 37.77391, 37.74665, 37.792...
## $ location   <chr> "(37.7383221869053, -122.405646994567)", "(37.7617677182954, -122.435012093789)", "(37.780035626...
## $ pdid       <dbl> 1.407343e+13, 1.407363e+13, 1.461779e+13, 1.461775e+13, 1.407342e+13, 1.407343e+13, 1.407343e+13...
# Derive the hour of day from the incident time column.
sf <- mutate(sf, start_hour = hour(time))
# Horizontal bar chart of incident counts per category, ordered by count.
# Explicit axis labels are set (matching the Seattle chart); without them
# the category axis is labelled with the raw `fct_reorder(category, n)`
# expression.
sf %>%
    count(category) %>%
    ggplot() +
    geom_col(aes(x = fct_reorder(category, n), y = n)) +
    coord_flip() +
    labs(
        y = "# crimes",
        x = "type of crime"
    )

# Heat map of incident counts per category (y) and hour of day (x).
# The previously computed `total` column was never used and the ungroup()
# was a no-op on the ungrouped result of count(), so both are dropped.
sf %>%
    count(category, start_hour) %>%
    # Order the categories by fct_reorder()'s default summary of their
    # hourly counts (unchanged from the original ordering).
    mutate(category = fct_reorder(category, n)) %>%
    ggplot() +
    geom_tile(aes(start_hour, category, fill = n)) +
    scale_fill_viridis(name = "# crimes") +
    labs(
        x = "Hour of the day",
        y = "type of crime",
        title = "Crime types during the day for San Francisco"
    )

Compare crime numbers for both cities

As the crime categories differ between the two cities, I decided to compare only the overall number of crimes during the day.

# Hourly incident counts for each city, stacked into one long table
# with a `city` column identifying the source.
both <- bind_rows(
    seattle %>% count(start_hour) %>% mutate(city = "Seattle"),
    sf %>% count(start_hour) %>% mutate(city = "San Francisco")
)

# One line per city: number of incidents by hour of day.
ggplot(both, aes(x = start_hour, y = n, color = city)) +
    geom_line() +
    labs(
        title = "# crimes during the day",
        x = "Hour of the day",
        y = "# crimes"
    )

This shows that there is a similar overall trend in both cities. Crime peaks at midnight and then decreases until 5 o’clock, where it reaches its minimum.

Then it increases again until 12 o’clock, where it reaches another maximum as high as the nightly maximum.

Overall, Seattle has more crimes than San Francisco.

The following plot shows this more clearly, but the difference is small.

# Total number of incidents per city. bind_rows() tags the two one-row
# tables with the ids "1" and "2", which are then relabelled with the
# city names before plotting.
bind_rows(count(seattle), count(sf), .id = "city") %>%
    mutate(
        city = fct_recode(city, Seattle = "1", `San Francisco` = "2")
    ) %>%
    ggplot(aes(x = city, y = n)) +
    geom_col() +
    labs(x = "City", y = "# crimes")